UHF_zipcode =
read_csv("./data/appb.csv") %>%
slice(-43) %>%
select(-Borough) %>%
rename("UHF" = "UHF Neighborhood") %>%
janitor::clean_names()
raw_hiv =
GET("https://data.cityofnewyork.us/api/views/fju2-rdad/rows.csv") %>%
content("parsed") %>%
janitor::clean_names()
combine_hiv =
right_join(UHF_zipcode, raw_hiv, by = "uhf") %>%
janitor::clean_names() %>%
separate(zip_code, into = c("zipcode1", "zipcode2", "zipcode3",
"zipcode4", "zipcode5", "zipcode6", "zipcode7", "zipcode8",
"zipcode9"), sep = ", ") %>%
gather(key = zip_code, value = zipcode_value, zipcode1:zipcode9) %>%
filter(!is.na(zipcode_value)) %>%
rename("zipcode" = "zipcode_value") %>%
select(zipcode, everything(), -zip_code)
r = GET('http://data.beta.nyc//dataset/3bf5fb73-edb5-4b05-bb29-7c95f4a727fc/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90/download/f4129d9aa6dd4281bc98d0f701629b76nyczipcodetabulationareas.geojson')
nyc_zipcode = readOGR(content(r,'text'), 'OGRGeoJSON', verbose = F)
zipcode_lat_lng = nyc_zipcode@data %>%
select(zipcode = postalCode, longitude, latitude) %>%
mutate(zipcode = as.character(zipcode))
combine_hiv1 =
full_join(zipcode_lat_lng, combine_hiv, by = "zipcode") %>%
mutate(longitude = as.numeric(longitude), latitude = as.numeric(latitude)) %>%
group_by(uhf) %>%
summarise(lng = mean(longitude),
lat = mean(latitude)) %>%
filter(!(uhf == "Pelhem - Throgs Neck"))
pums_raw = read_csv("./data/selected_pums.csv")
temp = tempfile(fileext = ".xls")
dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/nyc_zcta10_to_puma10.xls"
download.file(dataURL, destfile = temp, mode = 'wb')
zcta_to_puma = readxl::read_xls(temp, sheet = 2) %>%
select(zcta = zcta10, puma = puma10) %>%
mutate(puma = as.numeric(puma))
dataURL = "http://faculty.baruch.cuny.edu/geoportal/resources/nyc_geog/zip_to_zcta10_nyc_revised.xls"
download.file(dataURL, destfile = temp, mode = 'wb')
zip_to_zcta = readxl::read_xls(temp, sheet = 2) %>%
select(zipcode, zcta = zcta5)
pums_data =
pums_raw %>%
select(puma = PUMA10, income = PINCP, year = ADJINC) %>%
filter(puma != -9) # remove data from 2011 due to lack of area information
pums_data$year = recode(pums_data$year,
"1042852" = "2012",
"1025215" = "2013",
"1009585" = "2014",
"1001264" = "2015")
pums_data =
pums_data %>%
group_by(year, puma) %>%
summarise(mid_income = median(income, na.rm = TRUE)) %>%
ungroup() # calculate yearly median income for each area
puma_to_zipcode = right_join(zip_to_zcta, zcta_to_puma, by = "zcta") %>% # generaate a puma to zipcode file
select(puma, zipcode)
income_zipcode = right_join(pums_data, puma_to_zipcode, by = "puma") %>% # matching zipcode with median income data
select(year, zipcode, mid_income) %>%
mutate(year = as.numeric(year))
combine_hiv_income =
left_join(combine_hiv, income_zipcode, by = c("year", "zipcode"))
income_hiv = combine_hiv_income
income_hiv %>%
filter(year != "2011" & age != "All") %>%
lm(hiv_diagnoses ~ borough + gender + age + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.983 | 0.302 | 3.252 | 0.001 |
| boroughBrooklyn | 0.297 | 0.281 | 1.060 | 0.289 |
| boroughManhattan | 3.091 | 0.331 | 9.332 | 0.000 |
| boroughQueens | -1.245 | 0.259 | -4.811 | 0.000 |
| boroughStaten Island | -4.376 | 0.397 | -11.016 | 0.000 |
| genderMale | 6.083 | 0.152 | 40.138 | 0.000 |
| age20 - 29 | 9.600 | 0.262 | 36.576 | 0.000 |
| age30 - 39 | 6.870 | 0.262 | 26.175 | 0.000 |
| age40 - 49 | 4.627 | 0.262 | 17.627 | 0.000 |
| age50 - 59 | 2.355 | 0.262 | 8.972 | 0.000 |
| age60+ | 0.427 | 0.262 | 1.626 | 0.104 |
| mid_income | 0.000 | 0.000 | -17.851 | 0.000 |
income_hiv %>%
filter(year != "2011" & race != "All") %>%
lm(hiv_diagnoses ~ borough + gender + race + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1.514 | 0.514 | 2.945 | 0.003 |
| boroughBrooklyn | 0.357 | 0.493 | 0.724 | 0.469 |
| boroughManhattan | 3.710 | 0.582 | 6.376 | 0.000 |
| boroughQueens | -1.494 | 0.454 | -3.287 | 0.001 |
| boroughStaten Island | -5.251 | 0.698 | -7.527 | 0.000 |
| genderMale | 7.299 | 0.266 | 27.425 | 0.000 |
| raceBlack | 10.932 | 0.421 | 25.978 | 0.000 |
| raceLatino/Hispanic | 9.027 | 0.421 | 21.451 | 0.000 |
| raceOther/Unknown | -1.380 | 0.421 | -3.278 | 0.001 |
| raceWhite | 3.628 | 0.421 | 8.621 | 0.000 |
| mid_income | 0.000 | 0.000 | -12.197 | 0.000 |
income_plot = income_hiv %>%
filter(year != "2011") %>%
group_by(uhf, year) %>%
summarise(sum_hiv = mean(hiv_diagnoses), mid_in = median(mid_income)) %>%
ggplot(aes(x = mid_in, y = sum_hiv, color = year)) +
geom_point() +
geom_smooth(method = lm) +
theme_bw() +
theme(legend.position = "None") +
labs(
title = "Income Influence on HIV Incidence",
x = "Average income of each neighborhood",
y = "HIV diagnoses",
caption = "Data from the ..."
)
ggplotly(income_plot)
Interpretation:
According to the scatterplot of income vs HIV diagnoses, it is obvious that the points are mostly concentrated in low income neigbourhood and the number of HIV diagnoses in high average income area( > 60000/year) centered in less than 1000 cases/neigbourhood. This result is exactly what we expected, because low-income community are tend to have insufficient healthcare supply, less insurance coverage, poor education level and inadequate epidemiology awareness, which could jointly cause the relatively high HIV incidence. So, we intend to advocate the related authorities to spare more public health resource in low-to-median-income neigbourhood to raise public awareness of HIV prevention knowledge and increase budgets of medical facilities in those areas.
Limitations:
We can not visualize the effect of age and race at the same time.
income_hiv %>%
filter(year != "2011" & age != "All") %>%
lm(hiv_diagnosis_rate ~ borough + gender + age + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 17.851 | 1.511 | 11.811 | 0.000 |
| boroughBrooklyn | -12.078 | 1.403 | -8.611 | 0.000 |
| boroughManhattan | 15.424 | 1.656 | 9.316 | 0.000 |
| boroughQueens | -22.620 | 1.293 | -17.492 | 0.000 |
| boroughStaten Island | -30.524 | 1.985 | -15.377 | 0.000 |
| genderMale | 39.822 | 0.757 | 52.582 | 0.000 |
| age20 - 29 | 44.060 | 1.312 | 33.589 | 0.000 |
| age30 - 39 | 33.215 | 1.312 | 25.322 | 0.000 |
| age40 - 49 | 27.986 | 1.312 | 21.336 | 0.000 |
| age50 - 59 | 13.841 | 1.312 | 10.552 | 0.000 |
| age60+ | -4.261 | 1.312 | -3.249 | 0.001 |
| mid_income | -0.001 | 0.000 | -18.326 | 0.000 |
income_hiv %>%
filter(year != "2011" & race != "All") %>%
lm(hiv_diagnosis_rate ~ borough + gender + race + mid_income, data = .) %>%
summary() %>%
broom::tidy() %>%
knitr::kable(digits = 3)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 0.795 | 2.459 | 0.323 | 0.747 |
| boroughBrooklyn | -11.951 | 2.357 | -5.070 | 0.000 |
| boroughManhattan | 22.135 | 2.783 | 7.955 | 0.000 |
| boroughQueens | -25.218 | 2.173 | -11.603 | 0.000 |
| boroughStaten Island | -31.251 | 3.336 | -9.367 | 0.000 |
| genderMale | 49.480 | 1.273 | 38.875 | 0.000 |
| raceBlack | 61.810 | 2.012 | 30.713 | 0.000 |
| raceLatino/Hispanic | 34.404 | 2.012 | 17.095 | 0.000 |
| raceOther/Unknown | 9.809 | 2.012 | 4.874 | 0.000 |
| raceWhite | 12.984 | 2.012 | 6.452 | 0.000 |
| mid_income | 0.000 | 0.000 | -1.729 | 0.084 |
income_plot_diag_rate = income_hiv %>%
filter(year != "2011") %>%
group_by(uhf, year) %>%
summarise(sum_hiv_diagnosis_rate = sum(hiv_diagnosis_rate), mid_in = median(mid_income)) %>%
ggplot(aes(x = mid_in, y = sum_hiv_diagnosis_rate, color = year)) +
geom_point() +
geom_smooth(method = lm) +
theme_bw() +
theme(legend.position = "None")
ggplotly(income_plot_diag_rate)